Regression exploration

source("functions.R")

1 Introduction

1.1 Nettoyage des données

On commence par importer le jeu de données et on vérifie s’il y a des valeurs manquantes, ce qui n’est pas le cas. On peut donc continuer avec l’analyse des données en vérifiant le type des variables :

On va transformer Bonus_Malus en variable binaire et retirer les variables qui ne sont pas utiles pour la prédiction, comme PolID.

library(rmarkdown)
library(dplyr)

# Load the training and test sets (comma-separated files, "." as decimal mark).
# `TRUE` is spelled out because `T` is an ordinary variable that can be reassigned.
train <- read.csv("./data/train_set.csv", header = TRUE, sep = ",", dec = ".")
test <- read.csv("./data/test_set.csv", header = TRUE, sep = ",", dec = ".")



# Total number of missing cells in the training set.
sum(is.na(train))
## [1] 0
# Collapse Bonus_Malus into two labels (below 100 -> "Bonus", otherwise "Malus")
# and drop the PolID column; the same two steps are applied to both data sets.
train <- train %>%
    mutate(Bonus_Malus = ifelse(Bonus_Malus < 100, "Bonus", "Malus")) %>%
    select(-PolID)
test <- test %>%
    mutate(Bonus_Malus = ifelse(Bonus_Malus < 100, "Bonus", "Malus")) %>%
    select(-PolID)

# Preview the cleaned training data as a paged table.
paged_table(train)

On peut maintenant continuer avec l’analyse des données en vérifiant le type des variables :

library(kableExtra)

# Classify the columns of train with the project helper, then wrap the name
# vectors in one-column data frames so that they can be rendered as tables.
# Binary variables are listed after the categorical ones.
variables <- classifier_variables_tab(train)
numeric_variables <- data.frame(variables_numƩriques = variables$variables_numeriques)
categorical_variables <- data.frame(
    variables_catƩgorielles = c(variables$variables_categorielles, variables$variables_binaires)
)

# Render the categorical/binary variable names as a compact styled table.
categorical_variables %>%
    kable() %>%
    kable_styling(
        bootstrap_options = c("striped", "hover", "condensed", "responsive"),
        full_width = FALSE
    )
variables_catƩgorielles
Car_Model
Urban_rural_class
French_region
Bonus_Malus
Car_Fuel
# Render the numeric variable names as a compact styled table.
numeric_variables %>%
    kable() %>%
    kable_styling(
        bootstrap_options = c("striped", "hover", "condensed", "responsive"),
        full_width = FALSE
    )
variables_numƩriques
Claim
Period_Exp
Car_Power
Car_Age
Age
Inhab_density
# On va convertir les variables catégorielles en facteurs ; on obtient alors :


# Classify the columns of train with the project helper and keep the results
# as plain name vectors (binary variables are appended to the categorical ones).
variables <- classifier_variables_tab(train)
numeric_variables <- variables$variables_numeriques
categorical_variables <- c(variables$variables_categorielles, variables$variables_binaires)

# Turn every categorical/binary column into a factor, in both data sets.
train[categorical_variables] <- lapply(
    train[categorical_variables],
    function(column) factor(column)
)
test[categorical_variables] <- lapply(
    test[categorical_variables],
    function(column) factor(column)
)

# Inspect the resulting column types.
str(train)
## 'data.frame':    542389 obs. of  11 variables:
##  $ Claim            : int  4 5 8 4 11 4 0 0 0 0 ...
##  $ Period_Exp       : num  0.56 1 0.41 0.27 0.08 0.1 0.96 0.73 0.09 0.73 ...
##  $ Car_Power        : int  4 7 4 5 4 4 14 10 4 5 ...
##  $ Car_Age          : int  4 9 12 9 13 1 25 2 12 4 ...
##  $ Age              : int  46 67 52 23 53 31 49 38 27 32 ...
##  $ Bonus_Malus      : Factor w/ 2 levels "Bonus","Malus": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Car_Model        : Factor w/ 11 levels "B1","B10","B11",..: 9 7 1 8 1 4 2 4 10 8 ...
##  $ Car_Fuel         : Factor w/ 2 levels "Diesel","Regular": 1 1 2 1 2 2 2 1 2 1 ...
##  $ Urban_rural_class: Factor w/ 6 levels "A","B","C","D",..: 1 5 4 5 4 5 5 3 3 3 ...
##  $ Inhab_density    : int  29 4762 824 6924 824 2983 5053 160 229 461 ...
##  $ French_region    : Factor w/ 22 levels "Alsace","Aquitaine",..: 7 21 13 12 13 17 12 20 6 6 ...

1.2 Étude des variables catégorielles :

1.2.1 Car Model

# Car_Model: run the three categorical plotting helpers in turn.
# The helpers are not defined in this file (presumably they come from
# functions.R - confirm); each call stays a separate top-level statement.
plot_categorical(train, "Car_Model")

plot_percentage(train, "Car_Model")

plot_hist_by_claim(train, "Car_Model")

1.2.2 Bonus_Malus

# Bonus_Malus: run the three categorical plotting helpers in turn.
# The helpers are not defined in this file (presumably they come from
# functions.R - confirm).
plot_categorical(train, "Bonus_Malus")

plot_percentage(train, "Bonus_Malus")

plot_hist_by_claim(train, "Bonus_Malus")

1.2.3 Urban_rural_class

# Urban_rural_class: run the three categorical plotting helpers in turn.
# The helpers are not defined in this file (presumably they come from
# functions.R - confirm).
plot_categorical(train, "Urban_rural_class")

plot_percentage(train, "Urban_rural_class")

plot_hist_by_claim(train, "Urban_rural_class")

1.2.4 Car_Fuel

# Car_Fuel: run the three categorical plotting helpers in turn.
# The helpers are not defined in this file (presumably they come from
# functions.R - confirm).
plot_categorical(train, "Car_Fuel")

plot_percentage(train, "Car_Fuel")

plot_hist_by_claim(train, "Car_Fuel")

library(vcd)
# Shaded mosaic plot of the Car_Fuel x Bonus_Malus cross-classification.
mosaic(~Car_Fuel + Bonus_Malus, data = train, shade = TRUE)

# Disabled alternative: association plot of the same two-way table.
# assocplot(table(train$Car_Fuel, train$Bonus_Malus))

1.2.5 French_region

# Re-source the helper file (it is already sourced at the top of the document),
# then run the three categorical plotting helpers for French_region.
source("functions.R")
plot_categorical(train, "French_region")

plot_percentage(train, "French_region")

plot_hist_by_claim(train, "French_region")

# Region-level plot built from the given GeoJSON file; the helper is not
# defined in this file. NOTE(review): the file name suggests the pre-2015
# region boundaries - confirm.
plot_claims_by_region(train, "./data/regions-avant-redecoupage-2015.geojson")

1.3 Étude des variables numériques

1.3.1 Inhab_density

# Draw and print three diagnostic plots for one numeric column:
#   1. a density-scaled histogram (30 bins) with a density curve on top,
#   2. a boxplot of the column,
#   3. a scatter plot of the column against Claim.
#
# data:     data frame containing `variable` and a `Claim` column.
# variable: name of the numeric column, given as a string.
#
# The value of the function is that of the final print() call.
plot_numeric <- function(data, variable) {
    # `.data[[variable]]` replaces the deprecated aes_string(); the x label is
    # set explicitly so the axis still shows the plain column name.
    p1 <- ggplot(data, aes(x = .data[[variable]])) +
        geom_histogram(aes(y = ..density..), bins = 30, fill = "lightblue", color = "black") +
        geom_density(alpha = 0.2, fill = "#FF6666") +
        labs(title = paste("Distribution de la variable", variable), x = variable) +
        theme_bw()

    p2 <- ggplot(data, aes(x = .data[[variable]])) +
        geom_boxplot(fill = "lightblue", color = "black") +
        labs(title = paste("Boxplot de la variable", variable), x = variable) +
        theme_bw()

    # Plot the `data` argument: the previous version read the global `train`
    # here, so the scatter plot ignored whatever data frame was passed in.
    p3 <- ggplot(data, aes(x = .data[[variable]], y = Claim)) +
        geom_point(alpha = 0.6, color = "darkorange") +
        labs(
            title = paste("Relation entre", variable, "et nombre de sinistres"),
            x = variable,
            y = "Nombre de sinistres"
        ) +
        theme_minimal()

    print(p1)
    print(p2)
    print(p3)
}

# Build the per-Claim histogram of one column and, optionally, show the
# per-Claim boxplot as well.
#
# data:         data frame containing `col` and a `Claim` column.
# col:          name of the column to plot, given as a string.
# show_boxplot: when TRUE, the boxplot of `col` by Claim is printed before the
#               histogram is returned. The default (FALSE) keeps the previous
#               behaviour, in which the boxplot was built but never displayed.
#
# Returns the histogram (20 bins, filled by Claim) as a ggplot object.
box_plot <- function(data, col, show_boxplot = FALSE) {
    # Claim is treated as a discrete grouping variable for the fill colours.
    data$Claim <- as.factor(data$Claim)

    if (show_boxplot) {
        p1 <- ggplot(data, aes(x = Claim, y = .data[[col]], fill = Claim)) +
            geom_boxplot() +
            labs(title = paste("Distribution de", col, " par Claim"), x = "Claim", y = col) +
            theme_bw()
        print(p1)
    }

    # Histogram with 20 bins, one fill colour per Claim level.
    p2 <- ggplot(data, aes(x = .data[[col]], fill = Claim)) +
        geom_histogram(color = "black", bins = 20, alpha = 1) +
        labs(title = paste("Histogramme de", col, "par Claim"), x = col, y = "Nombre") +
        theme_bw()

    p2
}


# Inhab_density: histogram with density curve, boxplot and scatter against Claim.
plot_numeric(train, "Inhab_density")

# box_plot() returns the per-Claim histogram; it is displayed here because the
# call is a top-level expression.
box_plot(train, "Inhab_density")

# Number of training rows whose Inhab_density is below 1.
print(sum(train$Inhab_density < 1))
## [1] 0

1.3.2 Age

# Age: histogram with density curve, boxplot and scatter against Claim.
plot_numeric(train, "Age")

# Number of training rows with Age strictly above 80.
print(sum(train$Age > 80))
## [1] 4943

1.3.3 Car_Age

# Car_Age: histogram with density curve, boxplot and scatter against Claim.
plot_numeric(train, "Car_Age")

1.3.4 Car_Power

# Car_Power: histogram with density curve, boxplot and scatter against Claim.
plot_numeric(train, "Car_Power")

1.3.5 Period_Exp

# Period_Exp: histogram with density curve, boxplot and scatter against Claim.
plot_numeric(train, "Period_Exp")

1.4 Analyse de la target

# Re-source the helper file (it is already sourced at the top of the document).
# NOTE(review): if functions.R defines plot_numeric or box_plot, this replaces
# the versions defined earlier in this document - confirm.
source("functions.R")
# Percentage plot of the target variable Claim; the meaning of the third
# argument (5) is not visible from this file - TODO confirm against functions.R.
plot_percentage(train, "Claim", 5)

## Analyse des corrélations

Une heatmap pour visualiser les corrélations entre les variables numériques.

library(reshape2)
library(corrplot)

# Correlation analysis restricted to the numeric columns of the training set.
num_vars <- train[, c("Claim", "Period_Exp", "Car_Power", "Car_Age", "Age", "Inhab_density")]

# Pairwise correlation matrix, then reshaped to one row per pair of variables
# (columns Var1, Var2, value) so it can be drawn with ggplot.
corr_matrix <- cor(num_vars)
melted_cor <- melt(corr_matrix)

# Tile heatmap: red for negative, white at zero, blue for positive correlations.
# The title literal previously contained a mis-encoded character; it is written
# with a Unicode escape so the source stays ASCII while the plot shows the accent.
ggplot(data = melted_cor, aes(x = Var1, y = Var2, fill = value)) +
    geom_tile() +
    scale_fill_gradient2(low = "red", high = "blue", mid = "white", midpoint = 0) +
    labs(title = "Heatmap des corr\u00e9lations", x = "", y = "")

# Same matrix drawn with corrplot, using circles.
corrplot(corr_matrix, method = "circle")

# Persist the cleaned training and test sets as CSV files, without the
# row-name column.
write.csv(train, file = "./data/train_set_clean.csv", row.names = FALSE)
write.csv(test, file = "./data/test_set_clean.csv", row.names = FALSE)